%matplotlib inline
import pandas as pd
import numpy as np
#Visualization
#!pip install plotly_express
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
#Modeling and evaluation
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor, ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# Global plot style for every seaborn/matplotlib figure below.
sns.set_style("darkgrid")
# Download the Our World in Data COVID-19 dataset straight from GitHub.
OWIDdf = pd.read_csv('https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv')
# Work on a copy so the raw download stays untouched.
OWIDdfc = OWIDdf.copy()
OWIDdfc.head()
OWIDdfc.describe()
#many nonsensical negative values like -min in new_deaths columns.
df = OWIDdfc.copy()
# Minimum of new_cases confirms negative (retro-corrected) daily counts exist.
df.new_cases.min()
Taking absolute values of the columns that contain nonsensical negative values (data corrections reported as negative counts).
#new_cols = [col for col in df.columns if 'new' in col]
#new_cols
#df[new_cols].describe()
#df[new_cols] = abs(df[new_cols])
#df[new_cols].describe()
df.info()
#some of this information is too sparse to analyse, and is somewhat redundant when we are already analysing death rates
# NOTE(review): dropping by positional index 17:25 is fragile — the OWID schema
# changes over time; confirm these positions still match the intended columns.
df.drop(columns=df.columns[17:25].tolist(), inplace=True)
df_dropcol = df.copy()
# Use the report date as a DatetimeIndex so .plot() produces time-series axes.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
#Preprocessing
# Split the aggregate 'World' rows off from the per-country rows.
df_world = df[df.location=='World']
df = df[-df.location.isin(['World'])]
odf = df.copy()
odf1 = df.copy()
#creating crude mortality features
# Crude mortality = cumulative deaths as a fraction of total population.
odf['crude_mortality'] = (odf.total_deaths/odf.population)
odf.tail()[['location','crude_mortality']]
# Worldwide smoothed daily cases and deaths overlaid on a single axis.
fig, ax = plt.subplots(figsize=(10, 7))
df_world.new_cases_smoothed.plot(ax=ax, linestyle='-')
df_world.new_deaths_smoothed.plot(ax=ax, linestyle='-')
ax.legend(fontsize='x-large')
ax.set(xlabel='', ylabel='Cases', title='Worldwide cases in 2020')
# Canada case study: cumulative cases/deaths plus the government stringency index.
odf_canada = odf[odf.location=='Canada']
fig, ax = plt.subplots(3, 1, figsize=(15,10), sharex=True)
for x, y in enumerate([odf_canada.total_cases]):
    y.plot(ax=ax[0], linestyle='-')
for x, y in enumerate([odf_canada.total_deaths]):
    y.plot(ax=ax[0], linestyle='-', color='r')
for x, y in enumerate([odf_canada.total_deaths]):
    y.plot(ax=ax[1], linestyle='-', color='r')
for x, y in enumerate([odf_canada.stringency_index]):
    y.plot(ax=ax[2], linestyle='-')
ax[0].set(xlabel = '', ylabel='Population', title = 'Canada: new cases and deaths')
ax[1].set(xlabel = '', ylabel='Population', title = 'Canada: new deaths scaled')
ax[2].set(xlabel = '', ylabel='Stringency Index', title = 'Canada: Stringency Index')
ax[0].legend(loc='upper left', fontsize='x-large')
# Annotate the first reported Canadian case of the P.1 (Gamma) variant.
# NOTE(review): the label reads "February 7" while the arrow points at
# 2021-02-11 — confirm the intended date.
ax[0].annotate('February 7, 2021',
               xy=('2021-02-11', odf_canada.total_cases['2021-02-25']-20000),
               xycoords = 'data', xytext=('2021-02-01',580000),
               arrowprops=dict(facecolor='black'),
               horizontalalignment='left',
               verticalalignment='top')
ax[0].annotate('First reported case of P.1 Variant',
               xy=('2021-02-11', odf_canada.total_cases['2021-02-25']),
               xycoords = 'data', xytext=('2021-01-15',450000))
def vtplot(case='Canada'):
    """Plot cumulative cases, people vaccinated, and total vaccinations
    for a single location on one time axis."""
    fig, ax = plt.subplots(figsize=(14, 7))
    rows = df[df.location == case]
    rows.total_cases.dropna().plot(ax=ax, linestyle='-')
    rows.people_vaccinated.dropna().plot(ax=ax, linestyle='-')
    rows.total_vaccinations.dropna().plot(ax=ax, linestyle='-')
    ax.set_title('Vaccines and cases in '+case, fontsize='xx-large')
    ax.set_ylabel('Population')
    ax.legend(loc = 'upper left', fontsize='x-large')
# Vaccine/case trajectories, first by continent, then for selected countries.
# (Some locations are repeated — leftover re-runs of notebook cells.)
vtplot('North America')
vtplot('South America')
vtplot('Asia')
vtplot('Africa')
vtplot('Europe')
vtplot('Oceania')
vtplot('India')
vtplot('United States')
vtplot('Brazil')
vtplot('Europe')
vtplot('United Kingdom')
vtplot('Japan')
vtplot('South Korea')
vtplot('Philippines')
vtplot('North America')
vtplot('Asia')
# Cumulative cases and deaths per continent on separate figures.
fig, ax = plt.subplots(figsize=(15,10))
continents = ['South America','North America','Asia','Africa','Europe','Oceania']#No Antarctica information available
for i in continents:
    for x, y in enumerate([odf[odf.location==i].total_cases]):
        y.plot(ax=ax, linestyle='-')
ax.set(xlabel = '', ylabel='Population', title = 'Worldwide new cases by continent')
ax.legend(continents, fontsize='x-large')
fig, ax = plt.subplots(figsize=(15,9))
continents = ['South America','North America','Asia','Africa','Europe','Oceania']#No Antarctica information available
for i in continents:
    for x, y in enumerate([odf[odf.location==i].total_deaths]):
        y.plot(ax=ax, linestyle='-')
ax.set(xlabel = '', ylabel='Deaths', title = 'Worldwide deaths by continent')
ax.legend(continents, loc='upper left',fontsize= 'x-large')
# Country-level maxima (cumulative figures) with continent/world aggregates removed.
odf_nocontinent = odf[-odf.location.isin(['World','European Union','North America','South America','Asia','Africa','Europe','Oceania'])] # Removing Continents
odf_totalcases = odf_nocontinent.groupby(odf_nocontinent['location']).agg(['max'])
odf_totalcases.columns = odf_totalcases.columns.droplevel(1)
# World choropleth of total cases.
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='total_cases',
                    hover_name=odf_totalcases.index,
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
# World choropleth of total deaths with extra hover context.
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='total_deaths',
                    hover_name=odf_totalcases.index,
                    hover_data=['population_density','stringency_index','human_development_index'],
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
# World choropleth of median age, diverging around 20 years.
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color=odf_totalcases.median_age,
                    hover_name=odf_totalcases.index,
                    color_continuous_scale=px.colors.diverging.curl,
                    color_continuous_midpoint=20)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
# Subsets and top/bottom-5 country lists by median age.
odf_ma40 = odf[odf.median_age>40]
odf_ma20 = odf[odf.median_age<20]
top5_ma = odf_totalcases.sort_values(by='median_age', ascending=False).head(5).index.tolist()
bot5_ma = odf_totalcases.sort_values(by='median_age', ascending=True).head(5).index.tolist()
# Top-5 (solid) vs bottom-5 (dotted) median-age countries:
# panel 0 = cases per million, panel 1 = deaths per million.
fig, ax = plt.subplots(2,1,figsize=(12,8), sharex=True)
for i in top5_ma:
    for x, y in enumerate([odf[odf.location==i].total_cases_per_million]):
        y.plot(ax=ax[0], linestyle='-')
for i in bot5_ma:
    for x, y in enumerate([odf[odf.location==i].total_cases_per_million]):
        y.plot(ax=ax[0], linestyle='dotted')
for i in top5_ma:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[1], linestyle='-')
for i in bot5_ma:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[1], linestyle='dotted')
ax[0].set(xlabel = '', ylabel='Population', title= 'Total Cases per million')
#ax[0].set_title('Total Cases per million',fontsize=20)
ax[0].legend(top5_ma+bot5_ma, fontsize='large')
ax[1].set(xlabel = '', ylabel='Population', title='Total Deaths per million')
#ax[1].set_title('Total Deaths per million',fontsize=20)
plt.tight_layout()
# Per-country means (including mean stringency index over the whole period).
zf = pd.DataFrame(df.groupby(by=['iso_code', 'location']).agg(['mean']))
zf.reset_index(inplace=True)
zf.columns = zf.columns.droplevel(1)
zf.stringency_index.isnull().sum()
# Choropleth of mean stringency index.
fig = px.choropleth(zf, locations='iso_code',
                    color='stringency_index',
                    hover_name='location',
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
# Drop the five densest locations (micro-states skew the color scale).
list_pd = odf_totalcases.sort_values('population_density', ascending=False).head(5).index.tolist()
odf2 = odf_totalcases.copy()
odf2.drop(labels=list_pd, axis=0, inplace=True)
# Most/least stringent countries vs their per-million outcomes.
zf.sort_values(by='stringency_index', ascending=False).head(10)[['location','stringency_index','total_deaths_per_million','total_cases_per_million']]
zf.sort_values(by='stringency_index', ascending=True).head(10)[['location','stringency_index','total_deaths_per_million','total_cases_per_million']].dropna()
fig = px.choropleth(odf2, locations='iso_code',
                    color='population_density',
                    hover_name=odf2.index,
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
# Dual-axis figure: hospital beds per thousand (green line, left axis) vs
# total deaths per million (scatter, right axis), sorted by bed capacity.
# NOTE(review): `odft` is only defined much later in this file — running the
# script top-to-bottom raises NameError here; this reflects out-of-order
# notebook-cell execution and should be reordered.
hbdf = odft.sort_values(by='hospital_beds_per_thousand', ascending=False)[['hospital_beds_per_thousand','total_deaths_per_million']].dropna()
fig, host = plt.subplots(figsize=(12,8))
par1 = host.twinx()
host.set_xlim(-5, 165)
host.set_xticks([])
host.set_ylim(-1, 14)
par1.set_ylim(-100, 2500)
host.set_xlabel('countries')
host.set_ylabel('Hospital beds per thousand')
par1.set_ylabel('total deaths per million')
p1 = host.plot(np.array(hbdf.hospital_beds_per_thousand.values), color='g')
# assumes hbdf has exactly 160 rows after dropna — TODO confirm; a mismatch
# with np.arange(0,160) raises a length error.
p2 = par1.scatter(x=np.arange(0,160),y=np.array(hbdf.total_deaths_per_million.values))
host.legend(labels=['hospital beds per thousand'], loc='upper left', fontsize='x-large')
par1.legend(labels=['total_deaths'], loc='upper right', fontsize='x-large')
Counterintuitively, the trend shows that lower hospital capacity is somewhat correlated with a lower total death count. This means there are other factors to be considered.
# Choropleth of hospital bed capacity per thousand inhabitants.
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='hospital_beds_per_thousand',
                    hover_name=odf_totalcases.index,
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
# Countries with moderate case rates, sorted by death rate.
odf_totalcases[odf_totalcases.total_cases_per_million.between(2000,10000)][['total_deaths_per_million','total_cases_per_million','population','population_density','hospital_beds_per_thousand']].sort_values(by='total_deaths_per_million').dropna()
pd.set_option('display.float_format', '{:.2f}'.format)
# Similar-sized countries (100-200M population) ranked by bed capacity.
hbpt = odf_totalcases[odf_totalcases.population.between(100000000,200000000)][['total_deaths_per_million','total_cases_per_million','population','population_density','hospital_beds_per_thousand']].sort_values(by='total_deaths_per_million')
hbpt.sort_values(by='hospital_beds_per_thousand', ascending=False)
odf_hb7 = odf[odf.hospital_beds_per_thousand>6]
odf_hb6 = odf[odf.hospital_beds_per_thousand<6]
top5_hbpt = odf_totalcases.sort_values(by='hospital_beds_per_thousand', ascending=False).head(5).index.tolist()
bot5_hbpt = odf_totalcases.sort_values(by='hospital_beds_per_thousand', ascending=True).head(5).index.tolist()
bot5_hbpt
# Top-5 (solid) vs bottom-5 (dotted) countries by hospital beds per thousand:
# panel 0 = total cases per million, panel 1 = total deaths per million.
fig, ax = plt.subplots(2, 1, figsize=(12, 8), sharex=True)
# BUG FIX: panel 0 previously plotted total_deaths_per_million while its title
# said 'Total cases per million' (both panels showed identical curves);
# it now plots cases, mirroring the median-age figure earlier in the file.
for i in top5_hbpt:
    odf[odf.location == i].total_cases_per_million.plot(ax=ax[0], linestyle='-')
for i in bot5_hbpt:
    odf[odf.location == i].total_cases_per_million.plot(ax=ax[0], linestyle='dotted')
for i in top5_hbpt:
    odf[odf.location == i].total_deaths_per_million.plot(ax=ax[1], linestyle='-')
for i in bot5_hbpt:
    odf[odf.location == i].total_deaths_per_million.plot(ax=ax[1], linestyle='dotted')
ax[0].set(xlabel = '', ylabel='Population', title = 'Total cases per million')
ax[0].legend(top5_hbpt+bot5_hbpt, fontsize='large')
ax[1].set(xlabel = '', ylabel='Population', title = 'Total deaths per million')
# Choropleth of extreme-poverty rates.
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='extreme_poverty',
                    hover_name=odf_totalcases.index,
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
#Removing redundant columns
#some columns are removed due to multicollinearity
OWIDdf = OWIDdf.drop(columns=['aged_65_older','aged_70_older','iso_code','continent'])
odft=odf_totalcases.copy()
# Drop all time-series-style columns ('new'/'total'/'people') before correlating.
list_ts = [col for col in odft.columns if 'new' in col or 'total' in col or 'people' in col]
odft = odft.drop(columns=['aged_65_older','aged_70_older','iso_code','continent','reproduction_rate','positive_rate','tests_per_case']+list_ts)
odft.corr()
# Full correlation heatmap, saved to disk.
save = sns.heatmap(odft.corr()).set_title('OWID dataset Correlation Heatmap')
save.figure.savefig('heatmap.png')
# Single-column heatmaps: what correlates with cases / deaths per million.
sns.heatmap(odf_totalcases.corr()[['total_cases_per_million']].sort_values(by='total_cases_per_million', ascending=False),
            annot=True, cmap='BrBG').set_title('Features correlating with total_cases')
dhmp = sns.heatmap(odf_totalcases.corr()[['total_deaths_per_million']].sort_values(by='total_deaths_per_million', ascending=False), annot=True, cmap='BrBG')
dhmp.set_title('Features correlating with total_deaths')
nowiddfn = odf.select_dtypes(include='float64')
#ppdf = sns.pairplot(nowiddfn.sample(1000))
#ppdf.savefig("pairplotowid.png")
Some of the target variables to consider: reproduction_rate, total_cases_per_million, total_deaths_per_million. Normally we would also consider crude mortality rate or case-fatality rate as well. However, confirmed cases is often underreported AND undertested and as such may be harder to evaluate accurately. It is important to consider that the data being robust and accurate relies heavily on the nation's testing capacity.
Remember that the reproduction rate (R) describes the trajectory of the virus. A value of R = 1 means the number of new infections equals the number of new recoveries, so virus numbers will stagnate. A value of 6.74 means the number of infected is sharply increasing and may lead to a big spike in infections and deaths, depending on government mitigation strategy.
# Where and when was the highest reproduction rate recorded?
print(OWIDdf.reproduction_rate.max())
odf[odf.reproduction_rate==OWIDdf.reproduction_rate.max()]
odft.info()
South Korea reported one of the largest spikes in the early stages of the pandemic, yet they are one of the countries with the lowest COVID-19 Mortality, which may be attributed to hospital bed capacity.
Italy may have suffered more loss due to their higher median age.
Crude mortality or case fatality may be used as a generated target feature instead of total_deaths_per_million. However, because case-fatality veracity and consistency are heavily dependent on the testing capacity of the region, that feature will be discarded, and crude_mortality will be the focus.
# Crude mortality per country (cumulative deaths / population).
odfc = odf_totalcases.copy()
odfc['crude_mortality'] = (odfc.total_deaths/odfc.population)
odfc.crude_mortality.describe()
#if using crude_mortality, belgium actually performs the worst
odfc[odfc.crude_mortality == odfc.crude_mortality.max()][['population','crude_mortality']]
odfc.sort_values(by='crude_mortality', ascending = False).head(10)[['crude_mortality']]
odfc.sort_values(by='crude_mortality', ascending = True).head(10)[['crude_mortality']]
# Tanzania stopped reporting early — an example of data-quality limits.
odfc[odfc.index=='Tanzania'][['total_cases','total_deaths','population','crude_mortality']]
Many African Countries did very well in response to the virus.
# NOTE(review): `odfdt` is not created until the "most recent data" section
# near the end of this file — these two filters raise NameError when the
# script is run top-to-bottom (out-of-order notebook cells).
odfdt = odfdt[odfdt['total_cases_per_million'].notna()]
odfdt = odfdt[odfdt['population_density'].notna()]
# Per-country maxima, keeping 'location' as a regular column for plotly.
odf1 = df.groupby(df['location']).agg(['max'])
odf1.columns = odf1.columns.droplevel(1)
odf1.reset_index(level=0, inplace=True)
odf1.sort_values(by='total_vaccinations', ascending=False).head(10)[['location']].values.tolist()
def timeplot(loc, case1, case2):
    """Overlay two time-series columns for one location:
    `case1` in the default color, `case2` in red."""
    fig, ax = plt.subplots(figsize=(14, 7))
    rows = df[df.location == loc]
    rows[case1].plot(ax=ax, linestyle='-')
    rows[case2].plot(ax=ax, linestyle='-', color='r')
    ax.set_title(f'{case1} vs {case2}', fontsize='xx-large')
    ax.legend(loc='upper left', fontsize='x-large')
# Vaccination ramp-up vs cumulative cases for selected countries.
timeplot(loc='United States', case1='new_vaccinations_smoothed', case2='total_cases')
timeplot(loc='India', case1='new_vaccinations_smoothed', case2='total_cases')
timeplot(loc='Canada', case1='total_cases_per_million', case2='handwashing_facilities')
# Keep only countries with the fields needed for the bubble charts below.
odf1 = odf1[odf1.population.notna()]
odf1 = odf1[odf1.total_cases.notna()]
odf1 = odf1[odf1.population_density.notna()]
odf1 = odf1[odf1.human_development_index.notna()]
odf1 = odf1[odf1.gdp_per_capita.notna()]
odf1 = odf1[-odf1.location.isin(['World','European Union','North America','South America','Asia','Africa','Europe','Oceania'])]
odfpx = odf1.sort_values(by='population_density', ascending=False).head(10)
# Bubble charts: size encodes the third variable; color encodes continent.
px.scatter(odf1, x='population',y='population_density', size='total_deaths_per_million',
           color ='continent', hover_name='location',
           title='pop vs pop. density vs total deaths per million')
px.scatter(odf1, x='life_expectancy',y='human_development_index', size='gdp_per_capita',
           color ='continent', hover_name='location',
           title='HDI vs Life_expectancy vs gdp_per_capita')
odf1.info()
odf1 = odf1[odf1.total_deaths_per_million.notna()]
odf1[['female_smokers','male_smokers']]
# Combined smoking prevalence (sum of female and male rates).
odf1['smokers'] = odf1.female_smokers + odf1.male_smokers
px.scatter(odf1, x='cardiovasc_death_rate',y='diabetes_prevalence', size='total_deaths_per_million',
           color ='continent', hover_name='location',
           title='cardiovasc vs diabetes vs total_deaths_per_million')
px.scatter(odf1, x='male_smokers',y='female_smokers', size='total_deaths_per_million',
           color ='continent', hover_name='location',
           title='msmoke vs fsmoke vs total_deaths_per_million')
odf1 = odf1[odf1.median_age.notna()]
odf1.sort_values(by='population_density', ascending=False)[['location','population_density']]
odf_totalcases.sort_values(by='aged_65_older', ascending=False).head(10)
px.scatter(odf1, x='population',y='population_density', size='median_age',
           color ='continent', hover_name='location',
           title='cardiovasc, diabetes, smokers')
If the population of a country is under 1 million, the value of total_cases_per_million is scaled up proportionally and may be inaccurate.
Population density does not look like a large contributing factor to crude mortality rate, when looking at the higher total_deaths_per_million values.
# Handwashing-facility availability vs crude mortality (countries with data only).
odfdt_vis = odfdt[odfdt['handwashing_facilities'].notna()]
px.scatter(odfdt_vis,x='crude_mortality',y='handwashing_facilities',
           color='location', hover_data=['location'],
           title='Handwashing facilities and crude mortality')
The data shows that having more handwashing facilities does not necessarily indicate better COVID-19 outcomes.
px.scatter(odfdt,x='crude_mortality',y='median_age',
           color='location', hover_data=['location'],
           title='median age and crude mortality')
odft[odft.crude_mortality > 5.644496e-05].count() #50% of the target values to bucketize the output into 2 groups for classifying
#bucketized column
# 0 = crude mortality at/below the median cut point, 1 = above it.
odft['outcomes'] = pd.cut(odft['crude_mortality'],[0,5.644496e-05,1],labels = [0,1])
odft.outcomes.value_counts()
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa import stattools
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn.metrics import mean_squared_error
from pandas.core.nanops import nanmean as pd_nanmean
from sklearn.linear_model import LinearRegression
# Sanity-check summary statistics before the time-series modeling section.
odf.describe()
#rolling average for each variable
def cra(country, case='total_cases'):
    """Return the 7-day rolling mean of `case` for `country`, trimmed to the
    days since the series first reached 10.

    NOTE: the ts[-a:] trim assumes the series is non-decreasing (true for the
    cumulative total_* columns this is used with).
    """
    ts = odf.loc[(odf['location'] == country)]
    ts = ts[[case]]
    a = len(ts.loc[(ts[case] >= 10)])
    ts = ts[-a:]
    # BUG FIX: the original called ts.astype('int64') without assigning the
    # result — a silent no-op; the dead statement has been removed.
    return (ts.rolling(window=7, center=False).mean().dropna())
def crplot(country, case='total_cases'):
    """Plot the raw `case` series for `country` together with its 7-day
    rolling mean, trimmed to the days since the series first reached 10."""
    ts = odf.loc[(odf['location'] == country)]
    ts = ts[[case]]
    a = len(ts.loc[(ts[case] >= 10)])
    ts = ts[-a:]
    # BUG FIX: removed dead `ts.astype('int64')` — its result was never assigned.
    plt.figure(figsize=(16, 6))
    plt.plot(ts[case])
    plt.plot(ts.rolling(window=7, center=False).mean().dropna(), label='Rolling Mean')
    plt.legend(['Cases', 'Rolling Mean'])
    plt.title('Cases distribution in %s with rolling mean and standard' %country)
# Rolling-mean plots and a 30-day seasonal decomposition for Canada.
crplot('South America', case='total_cases_per_million')
c1 = cra('Canada')
crplot('Canada')
crplot('United States')
odfcra = pd.DataFrame(cra('Canada'))
fig = sm.tsa.seasonal_decompose(cra('Canada').values, period = 30).plot()
def stationarity(ts, label="Canada's total_cases"):
    """Run the Augmented Dickey-Fuller test on `ts` and print the results.

    Parameters
    ----------
    ts : array-like
        Time series to test for a unit root.
    label : str
        Series name shown in the printed header. Defaults to the original
        hard-coded "Canada's total_cases" for backward compatibility.

    BUG FIX: the header was hard-coded to Canada even when this function is
    called on India's and China's series below; it is now a parameter.
    """
    print('Augmented Dickey-Fuller test: ' + label)
    test = adfuller(ts, autolag='AIC')
    results = pd.Series(test[0:4], index=['Test Statistic','p-value','Lags Used','Number of Observations'])
    print(results)
stationarity(c1.total_cases.values)
The ADF test's null hypothesis is that the series has a unit root (is non-stationary). Because this p-value is above the alpha threshold, we cannot reject non-stationarity, so the data may be non-stationary; ARIMA, which differences the series, is a good model in this case.
# Same stationarity check for India and China (header still says Canada —
# see the note in stationarity()).
stationarity(cra('India').total_cases.values)
crplot('India')
stationarity(cra('China').total_cases.values)
crplot('China')
def autocorr(ts):
    """Draw ACF and PACF plots (15 lags) side by side for the given series."""
    plt.figure(figsize=(12, 5))
    grid = (1, 2)
    acf_axis = plt.subplot2grid(grid, (0, 0))
    pacf_axis = plt.subplot2grid(grid, (0, 1))
    plot_acf(ts, lags=15, title='ACF', ax=acf_axis)
    plot_pacf(ts, lags=15, title='PACF', ax=pacf_axis)
    plt.tight_layout()
autocorr(cra('Canada'))
import itertools
import warnings
warnings.simplefilter('ignore')
# NOTE(review): the three lines below duplicate the three above verbatim
# (a leftover from re-running notebook cells); they are harmless no-ops.
import itertools
import warnings
warnings.simplefilter('ignore')
def split(ts):
    """Split a sequence 85/15 into (train, test), preserving order."""
    cut = int(len(ts) * 0.85)
    return ts[:cut], ts[cut:]
#pDq hyperparameter optimization
def arima(ts, test):
    """Grid-search ARIMA(p, d, q) orders over 0..5 and return the order with
    the lowest AIC on `ts`.

    Parameters
    ----------
    ts : array-like training series.
    test : unused; kept so existing call sites keep working.

    Raises
    ------
    RuntimeError
        If no (p, d, q) combination could be fitted. (The original would have
        crashed with UnboundLocalError on `param` in that case.)
    """
    best_aic = float('inf')
    best_order = None
    for order in itertools.product(range(0, 6), repeat=3):
        try:
            result = ARIMA(ts, order=order).fit()
        # BUG FIX: the bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; only real fitting errors should be skipped.
        except Exception:
            continue
        if result.aic <= best_aic:
            best_aic = result.aic
            best_order = order
    if best_order is None:
        raise RuntimeError('no ARIMA(p,d,q) order could be fitted')
    return best_order
# Fit ARIMA models for Canada's cumulative cases and deaths using the
# grid-searched (p, d, q) orders.
c1 = cra('Canada',case='total_cases')
c2 = cra('Canada',case='total_deaths')
train,test=split(c2.total_deaths.values)
param=arima(train,test)
#Modeling
model_totaldeaths = ARIMA(train, order=param)
# BUG FIX: the original called model.fit() — a stale, unrelated variable —
# instead of fitting the model defined on the previous line.
result_totaldeaths = model_totaldeaths.fit()
train,test=split(c1.total_cases.values)
param=arima(train,test)
#Modeling
model_totalcases = ARIMA(train, order=param)
# BUG FIX: same stale-variable mistake as above (model.fit()).
result_totalcases = model_totalcases.fit()
def rmse(y1, y_pred):
    """Print and return the root-mean-squared error between two sequences.

    BUG FIX: the original called sqrt(), which was never imported anywhere in
    this file (NameError at runtime); the RMSE is now computed with numpy.
    The value is also returned (previously the function only printed it),
    which is backward-compatible.
    """
    y1, y_pred = np.array(y1), np.array(y_pred)
    error = np.sqrt(np.mean((y1 - y_pred) ** 2))
    print('Test RMSE: %.3f' % error)
    return error
# BUG FIX / reordering: `pred` was used before it was assigned, and the
# module-level name `result` never existed (it was local to arima()).
# At this point `train`/`test` hold the total_cases split, so the forecast
# error is computed against the total_cases model; the prediction plot below
# uses the deaths model, matching its title.
pred = result_totalcases.forecast(steps=len(test))[0]
rmse(test, pred)
plt.figure(figsize=(15,10))
layout=(1,1)
forecast = plt.subplot2grid(layout, (0,0))
plt.setp(forecast, xticks=[0,100,200,300,400,460], xticklabels=['March 1st, 2020','','','','April 1st, 2021'])
result_totaldeaths.plot_predict(start=10, end=460, ax=forecast)
forecast.legend(fontsize='xx-large', loc='upper left')
forecast.tick_params(axis='x', labelsize=15)
forecast.tick_params(axis='y', labelsize=13)
forecast.legend(['forecast','actual','95% CI'], loc = 'upper left', fontsize='xx-large')
forecast.set_ylabel('COVID19 Deaths', fontsize=15)
forecast.set_title('ARIMA Model Predictions for COVID19 Deaths in Canada', fontsize=20 )
# Predicted vs actual on the held-out window, plus the residual series.
fig, ax = plt.subplots(1, 2, figsize=(16,4))
ax[0].plot(pred, c='g', label = 'Predicted')
ax[0].plot(test, c='r', label = 'Actual')
ax[1].plot(pred-test, c='b', label = 'Residuals', marker='*')
ax[0].legend(loc='upper left', fontsize='x-large')
ax[1].legend(loc='upper left', fontsize='x-large')
plt.setp(ax[0], xticks=[0, 10, 20, 30, 40, 50,56], xticklabels=['February 1st, 2021','','','March 1st, 2021','','','April 1st, 2021'])
ax[1].tick_params(axis='x', labelsize=13)
fig.suptitle('ARIMA Model error', fontsize=20 )
print(result_totalcases.summary())
print(result_totaldeaths.summary())
#Most recent data
# NOTE(review): `odft` here refers to the version created later from
# odf_totalcases — out-of-order notebook-cell execution; running this file
# top-to-bottom uses the earlier, column-reduced odft instead.
odfdt = odft.copy()
odfdt['date'] = pd.to_datetime(odfdt['date'])
cutoff_date = pd.Timestamp('2021-02-01 00:00:00');
#Shortened dataset to only the most recent date to remove all time-related values
odfdt = odfdt[odfdt.date == cutoff_date]
# Build the regression feature matrix: drop every time-series-derived column
# so only static country attributes remain.
odft = odf_totalcases.copy()
# BUG FIX: the original first built list_nocol with a comprehension and then
# immediately overwrote it with the explicit list below; the dead
# comprehension has been removed.
list_nocol = ['new_cases',
              'new_cases_smoothed',
              'new_deaths',
              'new_deaths_smoothed',
              'total_cases_per_million',
              'new_cases_per_million',
              'new_cases_smoothed_per_million',
              'total_deaths_per_million',
              'new_deaths_per_million',
              'new_deaths_smoothed_per_million',
              'new_tests',
              'total_tests_per_thousand',
              'new_tests_per_thousand',
              'new_tests_smoothed',
              'new_tests_smoothed_per_thousand',
              'tests_per_case',
              'people_vaccinated',
              'people_fully_vaccinated',
              'new_vaccinations',
              'new_vaccinations_smoothed',
              'total_vaccinations_per_hundred',
              'people_vaccinated_per_hundred',
              'people_fully_vaccinated_per_hundred',
              'new_vaccinations_smoothed_per_million',
              'aged_65_older',
              'aged_70_older']
# Target: cumulative deaths; features: everything except identifiers,
# leak-prone totals and the target itself.
y=odft.total_deaths
X=odft[odft.columns.difference(list_nocol+['iso_code','location','continent','crude_mortality','total_deaths','positive_rate','population','total_cases'])]
# Random-forest regression on the static country features.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Median-impute missing features. BUG FIX: the validation split was previously
# imputed with a *second* imputer fitted on the validation data itself
# (information leakage and train/val inconsistency); it must reuse the
# statistics learned on the training split.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = imp.fit(train_X)
train_X = imp.transform(train_X)
val_X = imp.transform(val_X)
val_y = val_y.fillna(val_y.median())
train_y = train_y.fillna(train_y.median())
clf = RandomForestRegressor(n_estimators=100, bootstrap = True, oob_score = True, random_state = 1)
clf = clf.fit(train_X, train_y)
print('R^2 training set: {:.2f} \nR^2 val set: {:.2f}'.format(clf.score(train_X, train_y),clf.score(val_X, val_y)))
val_pred = clf.predict(val_X)
val_mae = mean_absolute_error(val_pred, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))
An extremely high R^2 score and very low MAE suggest severe overfitting of the model.
# Rank features by random-forest importance and plot them with the
# inter-tree standard deviation as error bars.
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
# BUG FIX: featurelist was re-created empty on every loop iteration, so it
# only ever held the final feature; it is now initialised once, before the loop.
featurelist = []
for f in range (train_X.shape[1]):
    featurelist.append(X.columns[indices[f]])
    print(f + 1,"\t", X.columns[indices[f]], importances[indices[f]])
plt.figure()
plt.title("Feature importances")
plt.bar(range(train_X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(train_X.shape[1]), indices)
plt.xlim([-1, train_X.shape[1]])
plt.show()
The model itself is too biased for these feature importances to hold significance.
# Linear model on the bucketised crude-mortality outcome.
y=odft.outcomes
X=odft[odft.columns.difference(['total_deaths_per_million','total_deaths','total_cases_per_million','date',
                                'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = imp.fit(train_X)
train_X = imp.transform(train_X)
# BUG FIX: reuse the training-set imputer on the validation split instead of
# fitting a second imputer on the validation data (leakage / inconsistent medians).
val_X = imp.transform(val_X)
##train_y2 = train_y.fillna(train_y.median())
# NOTE(review): sm.OLS on a 0/1 outcome is a linear-probability model and no
# intercept is added (sm.add_constant) — interpret coefficients with caution.
model = sm.OLS(train_y, train_X).fit()
predictions = model.predict(train_X)
model.summary()
print(X.columns[8], X.columns[7], X.columns[2])
Life expectancy, human_development_index and extreme_poverty look like the strongest candidates for feature importance from the linear regression; however, these results should be interpreted with caution.
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
#most recent date data
# Logistic regression on the single-date snapshot of the bucketised outcome.
y=odfdt.outcomes
X=odfdt[odfdt.columns.difference(['total_deaths_per_million','total_cases_per_million','date',
                                  'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = imp.fit(train_X)
train_X = imp.transform(train_X)
# BUG FIX: the validation split must be transformed with the imputer fitted
# on the training data, not a second imputer fitted on validation data.
val_X = imp.transform(val_X)
lrModel = LogisticRegression()
lrModel.fit(train_X, train_y)
y_pred = lrModel.predict(val_X)
print('Accuracy: {:.2f}'.format(lrModel.score(val_X, val_y)))
print(classification_report(val_y, y_pred))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
#Most recent data
odfdt = odft.copy()
odfdt['date'] = pd.to_datetime(odfdt['date'])
cutoff_date = pd.Timestamp('2021-02-01 00:00:00');
#Shortened dataset to only the most recent date to remove all time-related values
odfdt = odfdt[odfdt.date == cutoff_date]
y=odfdt.outcomes
X=odfdt[odfdt.columns.difference(['total_deaths_per_million','total_cases_per_million','date',
                                  'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])]
# BUG FIX: the X/y built above were never split or imputed — the scaler and
# KNN model silently reused train_X/val_X left over from the previous section.
# Split and impute the fresh data before scaling.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
imp = SimpleImputer(missing_values=np.nan, strategy='median').fit(train_X)
train_X = imp.transform(train_X)
val_X = imp.transform(val_X)
sc = StandardScaler()
X_train = sc.fit_transform(train_X)
X_test = sc.transform(val_X)
kNNmodel = KNeighborsClassifier(n_neighbors = 5)
kNNmodel.fit(X_train, train_y)
y_pred = kNNmodel.predict(X_test)
from sklearn.metrics import classification_report
# BUG FIX: confusion_matrix was never imported (NameError at runtime); it is
# now imported in the sklearn.metrics block at the top of the file.
cmatrix = confusion_matrix(val_y, y_pred)
cmatrix
print(classification_report(val_y, y_pred))
# Repeat KNN on the full aggregated odft data with a larger neighbourhood (k=50).
y=odft.outcomes
X=odft[odft.columns.difference(['total_deaths_per_million','total_cases_per_million','date',
                                'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])]
# BUG FIX: as in the previous section, the fresh X/y were never split or
# imputed before scaling — stale train_X/val_X from an earlier cell were used.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
imp = SimpleImputer(missing_values=np.nan, strategy='median').fit(train_X)
train_X = imp.transform(train_X)
val_X = imp.transform(val_X)
sc = StandardScaler()
X_train = sc.fit_transform(train_X)
X_test = sc.transform(val_X)
kNNmodel = KNeighborsClassifier(n_neighbors = 50)
kNNmodel.fit(X_train, train_y)
y_pred = kNNmodel.predict(X_test)
from sklearn.metrics import classification_report
# BUG FIX: confusion_matrix now resolved via the top-of-file import.
cmatrix = confusion_matrix(val_y, y_pred)
cmatrix
print(classification_report(val_y, y_pred))
#overfitting again